library(tidyverse)
## ── Attaching packages ─────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0 ✓ purrr 0.3.3
## ✓ tibble 3.0.0 ✓ dplyr 0.8.5
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
library(corrplot)
## corrplot 0.84 loaded
library(readxl)
library(ggplot2)
library(GGally)
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
library(DT)
# Load the housing data set from the Excel workbook in the working directory.
housing <- read_excel("Housing.xlsx")
# View(housing)
# NOTE: the former attach(housing) call was removed. `housing` is reassigned
# further down (na.omit, mutate), so an attached copy would silently go stale,
# and the calls in this script pass the data frame explicitly anyway.
datatable(housing, rownames = FALSE) # interactive table for visual organization
# Column names of the raw data.
names(housing)
## [1] "id" "price" "size" "lot"
## [5] "bath" "bedrooms" "yearbuilt" "agestandardized"
## [9] "garagesize" "status" "elem"
# Column-wise preview of the raw data: one line per column with type and
# leading values.
housing %>% glimpse()
## Rows: 76
## Columns: 11
## $ id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16…
## $ price <dbl> 388.0, 450.0, 386.0, 350.0, 155.5, 220.0, 239.5, 207.…
## $ size <dbl> 2.180, 2.054, 2.112, 1.442, 1.800, 1.965, 1.800, 2.25…
## $ lot <dbl> 4, 5, 5, 6, 1, 5, 4, 4, 4, 5, 5, 4, 4, 5, 4, 5, 4, 3,…
## $ bath <dbl> 3.0, 3.0, 2.0, 1.0, 2.0, 2.0, 1.1, 2.0, 2.1, 2.1, 2.0…
## $ bedrooms <dbl> 4, 4, 4, 2, 4, 3, 4, 4, 4, 3, 3, 4, 3, 3, 4, 3, 4, 3,…
## $ yearbuilt <dbl> 1940, 1957, 1955, 1956, 1994, 1940, 1958, 1961, 1965,…
## $ agestandardized <dbl> -3.0, -1.3, -1.5, -1.4, 2.4, -3.0, -1.2, -0.9, -0.5, …
## $ garagesize <dbl> 0, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,…
## $ status <chr> "sld", "sld", "sld", "act", "sld", "sld", "act", "sld…
## $ elem <chr> "edison", "edison", "edison", "adams", "adams", "adam…
# Row and column counts before incomplete rows are dropped.
dim(housing)
## [1] 76 11
# Keep only complete cases (rows without an NA in any column).
housing <- na.omit(housing)
We removed any rows of data with missing information. The data set still has 76 rows afterwards, so no rows were actually dropped.
# Per-column summary statistics of the cleaned data.
housing %>% summary()
## id price size lot
## Min. : 1.00 Min. :155.5 Min. :1.440 Min. : 1.000
## 1st Qu.:19.75 1st Qu.:242.8 1st Qu.:1.861 1st Qu.: 3.000
## Median :38.50 Median :276.0 Median :1.966 Median : 4.000
## Mean :38.50 Mean :285.8 Mean :1.970 Mean : 3.987
## 3rd Qu.:57.25 3rd Qu.:336.8 3rd Qu.:2.107 3rd Qu.: 5.000
## Max. :76.00 Max. :450.0 Max. :2.896 Max. :11.000
## bath bedrooms yearbuilt agestandardized
## Min. :1.000 Min. :2.000 Min. :1905 Min. :-6.50000
## 1st Qu.:2.000 1st Qu.:3.000 1st Qu.:1958 1st Qu.:-1.22500
## Median :2.000 Median :3.000 Median :1970 Median :-0.05000
## Mean :2.208 Mean :3.447 Mean :1969 Mean :-0.05921
## 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:1980 3rd Qu.: 1.00000
## Max. :3.100 Max. :6.000 Max. :2005 Max. : 3.50000
## garagesize status elem
## Min. :0.000 Length:76 Length:76
## 1st Qu.:1.000 Class :character Class :character
## Median :2.000 Mode :character Mode :character
## Mean :1.566
## 3rd Qu.:2.000
## Max. :3.000
# Compact display of the tibble's structure (column types and leading values).
housing %>% str()
## tibble [76 × 11] (S3: tbl_df/tbl/data.frame)
## $ id : num [1:76] 1 2 3 4 5 6 7 8 9 10 ...
## $ price : num [1:76] 388 450 386 350 156 ...
## $ size : num [1:76] 2.18 2.05 2.11 1.44 1.8 ...
## $ lot : num [1:76] 4 5 5 6 1 5 4 4 4 5 ...
## $ bath : num [1:76] 3 3 2 1 2 2 1.1 2 2.1 2.1 ...
## $ bedrooms : num [1:76] 4 4 4 2 4 3 4 4 4 3 ...
## $ yearbuilt : num [1:76] 1940 1957 1955 1956 1994 ...
## $ agestandardized: num [1:76] -3 -1.3 -1.5 -1.4 2.4 -3 -1.2 -0.9 -0.5 -0.2 ...
## $ garagesize : num [1:76] 0 2 2 1 1 1 1 2 2 2 ...
## $ status : chr [1:76] "sld" "sld" "sld" "act" ...
## $ elem : chr [1:76] "edison" "edison" "edison" "adams" ...
# Scatterplot matrix of the numeric columns price through garagesize
# (columns 2 to 9 of `housing`).
housing %>%
  select(price:garagesize) %>%
  pairs()
# Pairwise Pearson correlations between the same columns.
housing %>%
  select(price:garagesize) %>%
  cor()
## price size lot bath bedrooms
## price 1.0000000 0.20143783 0.24423228 0.1746578 -0.2861975
## size 0.2014378 1.00000000 0.04079199 0.4725406 0.2384530
## lot 0.2442323 0.04079199 1.00000000 -0.1709961 -0.2138298
## bath 0.1746578 0.47254061 -0.17099609 1.0000000 0.1468258
## bedrooms -0.2861975 0.23845303 -0.21382977 0.1468258 1.0000000
## yearbuilt 0.1541248 0.17656934 -0.03933975 0.3345239 -0.3631506
## agestandardized 0.1541248 0.17656934 -0.03933975 0.3345239 -0.3631506
## garagesize 0.3583861 0.17315776 0.23581664 0.2017635 -0.4038039
## yearbuilt agestandardized garagesize
## price 0.15412476 0.15412476 0.3583861
## size 0.17656934 0.17656934 0.1731578
## lot -0.03933975 -0.03933975 0.2358166
## bath 0.33452385 0.33452385 0.2017635
## bedrooms -0.36315062 -0.36315062 -0.4038039
## yearbuilt 1.00000000 1.00000000 0.5371846
## agestandardized 1.00000000 1.00000000 0.5371846
## garagesize 0.53718464 0.53718464 1.0000000
Looking at our initial variables compared to price, we do not see much of a pattern between the price of the house and the other variables. When we look at the output of cor(), we can see that there is a small “strong” (when comparing it to the other r values) positive correlation between price and garagesize at r=.3583861; the next-largest positive correlation is with lot at r=0.24423228. We can see a small “strong” negative correlation when we look at bedrooms (r=-0.2861975). All in all, we will not be able to use a simple linear model to get much of an accurate prediction when we look at the variables individually compared to price.
## Density of the raw price column.
g1 <- ggplot(data = housing, mapping = aes(x = price)) +
  geom_density(fill = "blue") +
  labs(title = "Price of House (in thousands of dollars)")
g1
## Add the base-10 log of price as `log_price` and plot its histogram.
housing <- mutate(housing, log_price = log10(price))
ggplot(data = housing, mapping = aes(x = log_price)) +
  geom_histogram(binwidth = 0.025, fill = "blue") +
  labs(title = "Log Transformation of Price")
## Density of the raw size column.
## The title previously lacked its closing parenthesis.
g2 <- ggplot(housing, aes(x = size)) +
  geom_density(fill = "red") +
  ggtitle("Size of House (in thousands of sqft)")
g2
## Add the base-10 log of size as `log_size` and plot its histogram.
housing <- housing %>%
  mutate(log_size = log10(size))
ggplot(housing, aes(x = log_size)) +
  geom_histogram(fill = "red", binwidth = 0.025) +
  ggtitle("Log Transformation of Variable Size")
## Density of the raw lot column.
g3 <- ggplot(data = housing, mapping = aes(x = lot)) +
  geom_density(fill = "green") +
  labs(title = "Size of Lot")
g3
## Add the base-10 log of lot as `log_lot` and plot its histogram.
housing <- mutate(housing, log_lot = log10(lot))
ggplot(data = housing, mapping = aes(x = log_lot)) +
  geom_histogram(binwidth = 0.025, fill = "green") +
  labs(title = "Log Transformation of Variable Lot")
## Relationship between log_price and log_size, with a least-squares line.
## The former scale_color_manual() layer was dropped: no colour aesthetic is
## mapped in this plot, so the scale had nothing to act on. `FALSE` replaces
## the reassignable shorthand `F`.
ggplot(housing, aes(x = log_size, y = log_price)) +
  geom_point(size = 0.5) +
  geom_smooth(method = "lm", se = FALSE, alpha = 0.6, size = 0.5, color = "black") +
  ggtitle("Relationship Between `log_price` and `log_size`")
## `geom_smooth()` using formula 'y ~ x'
## Relationship between log_price and log_lot, drawn the same way.
ggplot(housing, aes(x = log_lot, y = log_price)) +
  geom_point(size = 0.5) +
  geom_smooth(method = "lm", se = FALSE, alpha = 0.6, size = 0.5, color = "black") +
  ggtitle("Relationship Between `log_price` and `log_lot`")
## `geom_smooth()` using formula 'y ~ x'
## Boxplots of log_price, one box per level of a discrete predictor.
## Each plot fills the boxes from a six-colour rainbow palette and hides the
## legend, since the x axis already names the levels.

## log_price by listing status
ggplot(housing, aes(x = factor(status), y = log_price, fill = factor(status))) +
  geom_boxplot(alpha = 0.6) +
  scale_fill_manual(values = rainbow(6)) +
  xlab("Status") +
  theme(legend.position = "none")
## log_price by number of bedrooms
ggplot(housing, aes(x = factor(bedrooms), y = log_price, fill = factor(bedrooms))) +
  geom_boxplot(alpha = 0.6) +
  scale_fill_manual(values = rainbow(6)) +
  xlab("Bedrooms") +
  theme(legend.position = "none")
## log_price by number of bathrooms
ggplot(housing, aes(x = factor(bath), y = log_price, fill = factor(bath))) +
  geom_boxplot(alpha = 0.6) +
  scale_fill_manual(values = rainbow(6)) +
  xlab("Bathrooms") +
  theme(legend.position = "none")
## log_price by garage size
ggplot(housing, aes(x = factor(garagesize), y = log_price, fill = factor(garagesize))) +
  geom_boxplot(alpha = 0.6) +
  scale_fill_manual(values = rainbow(6)) +
  xlab("Garage Size") +
  theme(legend.position = "none")
## log_price against log_size, one panel per status level.
## Points are coloured by status; the fitted line in each panel is forced to
## black, and the legend is hidden because the panel strips name the levels.
ggplot(
  data = housing,
  mapping = aes(x = log_size, y = log_price, color = factor(status))
) +
  geom_point(size = 0.3) +
  geom_smooth(method = "lm", se = FALSE, alpha = 0.6, size = 0.5, color = "black") +
  scale_color_manual(values = rainbow(n = 12)) +
  facet_wrap(~status) +
  theme(legend.position = "none")
## `geom_smooth()` using formula 'y ~ x'
## Distribution of yearbuilt in 5-year bins.
## coord_cartesian() zooms the axis without discarding data. The previous
## scale_x_continuous(limits = c(1905, 2005)) made ggplot drop the two edge
## bars ("Removed 2 rows containing missing values (geom_bar)"). The view is
## padded by one bin width on each side so the edge bars are fully visible.
housing %>%
  ggplot(aes(x = yearbuilt)) +
  geom_histogram(binwidth = 5, fill = rainbow(1), alpha = 0.5) +
  coord_cartesian(xlim = c(1900, 2010))
## Pairs plot (scatterplots, densities and correlations) of columns 2 to 6:
## price, size, lot, bath and bedrooms.
## The colour is now set as a constant through wrap(). The previous
## `mapping = aes(color = "dark green")` mapped the string as a one-level
## grouping variable, so the panels were not actually drawn in dark green.
plot1 <- ggpairs(
  data = housing,
  columns = 2:6,
  lower = list(continuous = wrap("points", colour = "darkgreen")),
  diag = list(continuous = wrap("densityDiag", colour = "darkgreen")),
  axisLabels = "show"
)
plot1
# Full model: price on every numeric predictor, including both yearbuilt and
# agestandardized. Those two columns have correlation 1, so lm() reports one
# coefficient as "not defined because of singularities".
model1 <- lm(
  formula = price ~ size + lot + bath + bedrooms + yearbuilt +
    agestandardized + garagesize,
  data = housing
)
plot(model1)    # diagnostic plots
summary(model1) # coefficient table and fit statistics
##
## Call:
## lm(formula = price ~ size + lot + bath + bedrooms + yearbuilt +
## agestandardized + garagesize, data = housing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -113.63 -37.84 -5.10 39.59 141.28
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 943.9649 692.9278 1.362 0.1775
## size 49.0786 35.7330 1.373 0.1740
## lot 5.2912 4.1863 1.264 0.2105
## bath 17.3304 13.6654 1.268 0.2090
## bedrooms -23.1674 10.6429 -2.177 0.0329 *
## yearbuilt -0.3870 0.3531 -1.096 0.2768
## agestandardized NA NA NA NA
## garagesize 17.8106 10.6500 1.672 0.0990 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 54.9 on 69 degrees of freedom
## Multiple R-squared: 0.2383, Adjusted R-squared: 0.1721
## F-statistic: 3.598 on 6 and 69 DF, p-value: 0.003672
# Same predictors as model1 without agestandardized, leaving yearbuilt as the
# only age-related term.
initialmod <- lm(
  formula = price ~ size + lot + bath + bedrooms + yearbuilt + garagesize,
  data = housing
)
plot(initialmod)    # diagnostic plots
summary(initialmod) # coefficient table and fit statistics
##
## Call:
## lm(formula = price ~ size + lot + bath + bedrooms + yearbuilt +
## garagesize, data = housing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -113.63 -37.84 -5.10 39.59 141.28
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 943.9649 692.9278 1.362 0.1775
## size 49.0786 35.7330 1.373 0.1740
## lot 5.2912 4.1863 1.264 0.2105
## bath 17.3304 13.6654 1.268 0.2090
## bedrooms -23.1674 10.6429 -2.177 0.0329 *
## yearbuilt -0.3870 0.3531 -1.096 0.2768
## garagesize 17.8106 10.6500 1.672 0.0990 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 54.9 on 69 degrees of freedom
## Multiple R-squared: 0.2383, Adjusted R-squared: 0.1721
## F-statistic: 3.598 on 6 and 69 DF, p-value: 0.003672
# Same predictors as model1 without yearbuilt, leaving agestandardized as the
# only age-related term.
model2 <- lm(
  formula = price ~ size + lot + bath + bedrooms + agestandardized +
    garagesize,
  data = housing
)
plot(model2)    # diagnostic plots
summary(model2) # coefficient table and fit statistics
##
## Call:
## lm(formula = price ~ size + lot + bath + bedrooms + agestandardized +
## garagesize, data = housing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -113.63 -37.84 -5.10 39.59 141.28
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 181.482 64.574 2.810 0.00643 **
## size 49.079 35.733 1.373 0.17405
## lot 5.291 4.186 1.264 0.21051
## bath 17.330 13.665 1.268 0.20899
## bedrooms -23.167 10.643 -2.177 0.03292 *
## agestandardized -3.870 3.531 -1.096 0.27682
## garagesize 17.811 10.650 1.672 0.09898 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 54.9 on 69 degrees of freedom
## Multiple R-squared: 0.2383, Adjusted R-squared: 0.1721
## F-statistic: 3.598 on 6 and 69 DF, p-value: 0.003672
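Removing “yearbuilt” from the initial model did not increase the model’s predictive ability: the residual standard error, R squared values and p-value are identical to those of model1. It would seem that “yearbuilt” and “agestandardized” represent the same factor in different ways (their correlation is exactly 1), so removing “yearbuilt” only removed a redundant predictor; it did not change the fit.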
# Alternative to model2: keep yearbuilt and leave out agestandardized.
# The formula is the same as the one used for initialmod.
model3 <- lm(
  formula = price ~ size + lot + bath + bedrooms + yearbuilt + garagesize,
  data = housing
)
plot(model3)    # diagnostic plots
summary(model3) # coefficient table and fit statistics
##
## Call:
## lm(formula = price ~ size + lot + bath + bedrooms + yearbuilt +
## garagesize, data = housing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -113.63 -37.84 -5.10 39.59 141.28
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 943.9649 692.9278 1.362 0.1775
## size 49.0786 35.7330 1.373 0.1740
## lot 5.2912 4.1863 1.264 0.2105
## bath 17.3304 13.6654 1.268 0.2090
## bedrooms -23.1674 10.6429 -2.177 0.0329 *
## yearbuilt -0.3870 0.3531 -1.096 0.2768
## garagesize 17.8106 10.6500 1.672 0.0990 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 54.9 on 69 degrees of freedom
## Multiple R-squared: 0.2383, Adjusted R-squared: 0.1721
## F-statistic: 3.598 on 6 and 69 DF, p-value: 0.003672
Taking the alternative route, removing “agestandardized” and leaving “yearbuilt” gives the same results as the other way around. They can be used as substitutes for each other.
# Model without either age-related predictor (yearbuilt, agestandardized).
model4 <- lm(
  formula = price ~ size + lot + bath + bedrooms + garagesize,
  data = housing
)
plot(model4)    # diagnostic plots
summary(model4) # coefficient table and fit statistics
##
## Call:
## lm(formula = price ~ size + lot + bath + bedrooms + garagesize,
## data = housing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -109.048 -41.764 -7.633 40.100 148.517
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 187.689 64.417 2.914 0.00479 **
## size 46.034 35.676 1.290 0.20119
## lot 6.156 4.117 1.495 0.13933
## bath 13.570 13.247 1.024 0.30918
## bedrooms -19.622 10.154 -1.932 0.05736 .
## garagesize 13.118 9.766 1.343 0.18352
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 54.98 on 70 degrees of freedom
## Multiple R-squared: 0.225, Adjusted R-squared: 0.1697
## F-statistic: 4.065 on 5 and 70 DF, p-value: 0.002686
Removing both “yearbuilt” and “agestandardized” resulted in a lower multiple R squared, adjusted R squared and p-value. It seems that they bring down the predictive ability of the model.
# Drop "garagesize" from model4.
# NOTE(review): in the model4 summary the largest Pr(>|t|) belongs to "bath"
# (0.30918), not "garagesize" (0.18352), so the original justification for
# this step ("largest Pr(>|t|) value in model4") does not match the output.
model5 <- lm(
  formula = price ~ size + lot + bath + bedrooms,
  data = housing
)
plot(model5)    # diagnostic plots
summary(model5) # coefficient table and fit statistics
##
## Call:
## lm(formula = price ~ size + lot + bath + bedrooms, data = housing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -106.405 -40.046 -3.371 41.575 152.769
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 200.492 64.068 3.129 0.00254 **
## size 54.080 35.368 1.529 0.13070
## lot 7.210 4.065 1.774 0.08040 .
## bath 17.401 13.009 1.338 0.18528
## bedrooms -25.648 9.161 -2.800 0.00658 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 55.29 on 71 degrees of freedom
## Multiple R-squared: 0.2051, Adjusted R-squared: 0.1603
## F-statistic: 4.579 on 4 and 71 DF, p-value: 0.002398
Removing “garagesize” also had the effect of lowering multiple R squared, adjusted R squared and p-values. It too was harming the model’s predictive abilities.
# Drop "bath": it has the largest Pr(>|t|) value in the model5 summary.
model6 <- lm(
  formula = price ~ size + lot + bedrooms,
  data = housing
)
plot(model6)    # diagnostic plots
summary(model6) # coefficient table and fit statistics
##
## Call:
## lm(formula = price ~ size + lot + bedrooms, data = housing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -113.258 -44.153 -6.192 43.378 165.910
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 199.531 64.414 3.098 0.00278 **
## size 76.630 31.262 2.451 0.01667 *
## lot 6.053 3.993 1.516 0.13397
## bedrooms -25.776 9.210 -2.799 0.00658 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 55.59 on 72 degrees of freedom
## Multiple R-squared: 0.185, Adjusted R-squared: 0.1511
## F-statistic: 5.449 on 3 and 72 DF, p-value: 0.001967
It can be seen that multiple R squared, adjusted R squared and p-values continued to fall between “model5” and “model6.”
# Drop "lot": it has the largest Pr(>|t|) value in the model6 summary.
model7 <- lm(
  formula = price ~ size + bedrooms,
  data = housing
)
plot(model7)    # diagnostic plots
summary(model7) # coefficient table and fit statistics
##
## Call:
## lm(formula = price ~ size + bedrooms, data = housing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -115.266 -42.749 -6.262 40.138 173.437
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 225.717 62.603 3.606 0.000566 ***
## size 81.215 31.391 2.587 0.011666 *
## bedrooms -28.992 9.042 -3.206 0.001995 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 56.08 on 73 degrees of freedom
## Multiple R-squared: 0.159, Adjusted R-squared: 0.136
## F-statistic: 6.902 on 2 and 73 DF, p-value: 0.001797
Once again, another predictor was removed, “lot,” and the multiple R squared, adjusted R squared and p-values inched even lower. In the beginning it seemed like a complex model with a multitude of predictors would be useful in predicting “price,” but it doesn’t seem to be so clear.
It can be seen that “size” has a higher Pr(>|t|) than “bedrooms” in model 7. Upon further investigation, plot1 down below shows that “size” and “bedrooms” are more correlated than “size” and “price.”
# Drop "size" from model7, leaving bedrooms as the only predictor
# (size has the higher Pr(>|t|) of the two terms in model7).
model8 <- lm(formula = price ~ bedrooms, data = housing)
plot(model8)    # diagnostic plots
summary(model8) # coefficient table and fit statistics
##
## Call:
## lm(formula = price ~ bedrooms, data = housing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -117.356 -40.417 -7.213 40.480 177.144
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 366.512 32.116 11.412 <2e-16 ***
## bedrooms -23.414 9.112 -2.569 0.0122 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 58.2 on 74 degrees of freedom
## Multiple R-squared: 0.08191, Adjusted R-squared: 0.0695
## F-statistic: 6.602 on 1 and 74 DF, p-value: 0.0122
# Add "lot" back alongside bedrooms. Among the predictors shown in plot1 it
# has the largest positive correlation with price, although that correlation
# is still weak (r is about 0.24 in the cor() output).
model9 <- lm(formula = price ~ bedrooms + lot, data = housing)
plot(model9)    # diagnostic plots
summary(model9) # coefficient table and fit statistics
##
## Call:
## lm(formula = price ~ bedrooms + lot, data = housing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -99.861 -39.817 -7.952 43.778 168.198
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 327.038 39.274 8.327 3.42e-12 ***
## bedrooms -20.059 9.211 -2.178 0.0327 *
## lot 7.000 4.109 1.704 0.0927 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 57.46 on 73 degrees of freedom
## Multiple R-squared: 0.117, Adjusted R-squared: 0.09282
## F-statistic: 4.837 on 2 and 73 DF, p-value: 0.01065
# Experiment: remove "bedrooms" from model9, leaving lot as the only predictor.
model10 <- lm(formula = price ~ lot, data = housing)
plot(model10)    # diagnostic plots
summary(model10) # coefficient table and fit statistics
##
## Call:
## lm(formula = price ~ lot, data = housing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -103.674 -44.926 -9.456 48.240 155.174
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 250.261 17.738 14.108 <2e-16 ***
## lot 8.913 4.114 2.167 0.0335 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 58.9 on 74 degrees of freedom
## Multiple R-squared: 0.05965, Adjusted R-squared: 0.04694
## F-statistic: 4.694 on 1 and 74 DF, p-value: 0.03349
A model predicting “price” solely from “lot” is not optimal because it has the largest p-value so far (0.03349) and the smallest R squared values thus far.
A model of “bedroom” and “lot” doesn’t seem to
model1 Residual standard error: 54.9 on 69 degrees of freedom Multiple R-squared: 0.2383, Adjusted R-squared: 0.1721 F-statistic: 3.598 on 6 and 69 DF, p-value: 0.003672
model2 Residual standard error: 54.9 on 69 degrees of freedom Multiple R-squared: 0.2383, Adjusted R-squared: 0.1721 F-statistic: 3.598 on 6 and 69 DF, p-value: 0.003672
model3 Residual standard error: 54.9 on 69 degrees of freedom Multiple R-squared: 0.2383, Adjusted R-squared: 0.1721 F-statistic: 3.598 on 6 and 69 DF, p-value: 0.003672
model4 Residual standard error: 54.98 on 70 degrees of freedom Multiple R-squared: 0.225, Adjusted R-squared: 0.1697 F-statistic: 4.065 on 5 and 70 DF, p-value: 0.002686
model5 Residual standard error: 55.29 on 71 degrees of freedom Multiple R-squared: 0.2051, Adjusted R-squared: 0.1603 F-statistic: 4.579 on 4 and 71 DF, p-value: 0.002398
model6 Residual standard error: 55.59 on 72 degrees of freedom Multiple R-squared: 0.185, Adjusted R-squared: 0.1511 F-statistic: 5.449 on 3 and 72 DF, p-value: 0.001967
model7 Residual standard error: 56.08 on 73 degrees of freedom Multiple R-squared: 0.159, Adjusted R-squared: 0.136 F-statistic: 6.902 on 2 and 73 DF, p-value: 0.001797
model8 Residual standard error: 58.2 on 74 degrees of freedom Multiple R-squared: 0.08191, Adjusted R-squared: 0.0695 F-statistic: 6.602 on 1 and 74 DF, p-value: 0.0122
model9 Residual standard error: 57.46 on 73 degrees of freedom Multiple R-squared: 0.117, Adjusted R-squared: 0.09282 F-statistic: 4.837 on 2 and 73 DF, p-value: 0.01065
After analyzing each model, we removed predictor variables that were not statistically significant, i.e. those with a p-value greater than our alpha = 0.05.
After comparing models 7, 8, and 9 it is apparent that 8 is not as good of a model as the other two. Upon further consideration of the fact that all of the p-values were below 0.05, greater weight was placed on the adjusted R squared values. Models 2 and 3 came into favor. Model 2 seems to be the most optimal model because it compromises between lost adjusted R-squared value and a lower p-value. The multiple R squared value is much closer to 0 than to 1, so the model doesn’t explain a lot of the variation, but the low p-value shows that it is statistically significant. Model2 has been renamed finalmod for the purposes of the conclusion.
# Final model: refit with the same formula as model2
# (agestandardized as the age-related term).
finalmod <- lm(
  formula = price ~ size + lot + bath + bedrooms + agestandardized +
    garagesize,
  data = housing
)
plot(finalmod)    # diagnostic plots
summary(finalmod) # coefficient table and fit statistics
##
## Call:
## lm(formula = price ~ size + lot + bath + bedrooms + agestandardized +
## garagesize, data = housing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -113.63 -37.84 -5.10 39.59 141.28
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 181.482 64.574 2.810 0.00643 **
## size 49.079 35.733 1.373 0.17405
## lot 5.291 4.186 1.264 0.21051
## bath 17.330 13.665 1.268 0.20899
## bedrooms -23.167 10.643 -2.177 0.03292 *
## agestandardized -3.870 3.531 -1.096 0.27682
## garagesize 17.811 10.650 1.672 0.09898 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 54.9 on 69 degrees of freedom
## Multiple R-squared: 0.2383, Adjusted R-squared: 0.1721
## F-statistic: 3.598 on 6 and 69 DF, p-value: 0.003672
# 95% confidence intervals for the coefficients of finalmod.
confint(object = finalmod, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) 52.661508 310.302912
## size -22.206804 120.363966
## lot -3.060277 13.642710
## bath -9.931335 44.592134
## bedrooms -44.399478 -1.935417
## agestandardized -10.914528 3.173587
## garagesize -3.435653 39.056812
# One-row data frame describing the house to price; the column names match
# the predictors in finalmod.
NewHouseForSale <- data.frame(
  size = 2.5,
  lot = 3,
  bath = 3,
  bedrooms = 3,
  agestandardized = 0,
  garagesize = 1
)
# Fitted value with the lower and upper bounds of the confidence interval
# (predict()'s default level is used).
predict(object = finalmod, newdata = NewHouseForSale, interval = 'confidence')
## fit lwr upr
## 1 320.3517 273.3653 367.3382
Using finalmod, we input the information of our new house going up on the market as a data frame. The house we’re going to be using is a house of size 2.5 with a lot of 3 acres, 3 bathrooms, 3 bedrooms, from the year 1970, hence an agestandardized value of 0, and a garage size of 1. The predicted mean price of the house is $320.4K. Because the interval requested is a confidence interval, it describes the mean price rather than a single sale: we are 95% confident that the mean price of houses with these characteristics is between $273.4K and $367.3K.